#loading packages
library(ezids)
library(ggplot2)
library(ggrepel)
library(gridExtra)
library(tibble)
library(dplyr)
library(tidyr)
library(psych)
#loading data
NYweath <- data.frame(read.csv("data/NYC_weather_1869_2022.csv"))
#converting to R date format and adding columns for day, month, and year
NYweath$DATE <- as.Date(NYweath$DATE)
NYweath$day <- format(NYweath$DATE, format="%d")
NYweath$month <- format(NYweath$DATE, format="%m")
NYweath$year <- format(NYweath$DATE, format="%Y")
#converting temperature observations to numerical
NYweath$TMAX <- as.numeric(NYweath$TMAX)
NYweath$TMIN <- as.numeric(NYweath$TMIN)
NYweath$TAVG <- as.numeric(NYweath$TAVG)
NYweath$year <- as.numeric(NYweath$year)
#Making month a factor
NYweath$month <- as.factor(NYweath$month)
# subset data to desired variables
NYweath_sub <- subset(NYweath, select = c(DATE, day, month, year, TMAX, TMIN, TAVG, PRCP, SNOW))
#creating a subset for 1900 on
NYweath_00 <- subset(NYweath_sub, year > 1899)
xkabledplyhead(NYweath_00)
| DATE | day | month | year | TMAX | TMIN | TAVG | PRCP | SNOW | |
|---|---|---|---|---|---|---|---|---|---|
| 11265 | 1900-01-01 | 01 | 01 | 1900 | 20 | 14 | NA | 0.03 | 0.5 |
| 11266 | 1900-01-02 | 02 | 01 | 1900 | 23 | 13 | NA | 0.00 | 0.0 |
| 11267 | 1900-01-03 | 03 | 01 | 1900 | 26 | 19 | NA | 0.00 | 0.0 |
| 11268 | 1900-01-04 | 04 | 01 | 1900 | 32 | 19 | NA | 0.00 | 0.0 |
| 11269 | 1900-01-05 | 05 | 01 | 1900 | 39 | 29 | NA | 0.00 | 0.0 |
CW ADD: Adding a ‘TOT_PRCP’ row that sums up the total precipitation between SNOW and PRCP. This row will be used in Question 3.
NYweath_final <- NYweath_00
NYweath_final$TOT_PRCP <- NYweath_00$PRCP + NYweath_00$SNOW
In this project, we are digging into the relationship between human activity and weather in New York city. Our three driving questions are:
How do changes in NYC weather patterns correlate to changes in population and economic activity over the same time frame?
How do changes in NYC weather patterns correlate to changes in other urban climate factors such as air quality?
How do changes in weather patterns correlate to other local human activity, such as crime, reported COVID cases, and stock market performance?
Emily will re-do logistic regression looking at measures of local and global human activity as regressors rather than year. She will also look into variable transformations (i.e., linear models fit to polynomials of regressors) to see if the response is best fit as linear or polynomial.
Tejas will look at air local air quality data and how it relates to precipitation trends and local human activities. (Will include logistic regression.)
Chris will look at the local weather data and how it affects human behavior. Will look for correlations between precipitation, temperature, stock market, COVID tests, and crime
First step is to Transform Precip to Yes/No Factor Var. Will use PRCP_TOT to account for all PRCP.
# Add a column to convert PRCP to a binary factor variable. Don't care how much it rains, only if it rains.
NYweath_prcpFact <-NYweath_final
NYweath_prcpFact$PRCP_factor <- cut(NYweath_final$TOT_PRCP, c(-Inf,0, Inf), labels = c(0,1))
NYweath_prcpFact$PRCP_factor <- as.factor(NYweath_prcpFact$PRCP_factor)
Initial import of the data. Due to the size of the data, I imported it once, aggregated the data into a new data frame that only includes date and arrest count. This was exported and saved in the Git. The code below imports directly from that aggregated dataset.
#NYcrime <- data.frame(read.csv("/Users/christopherwasher/Documents/DATS6101/NYPD_Arrests_Data__Historic_.csv"))
#NYcrime_agg <- NYcrime %>% count(ARREST_DATE)
NYcrime_count <- tibble(read.csv("./data/NYPD_Arrests_counts.csv"))
NYcrime_count$ARREST_DATE <- as.Date(NYcrime_count$ARREST_DATE, format = "%Y-%m-%d")
#NYcrime_count$day <- format(NYcrime_count$ARREST_DATE, format="%d")
#NYcrime_count$month <- format(NYcrime_count$ARREST_DATE, format="%m")
#NYcrime_count$year <- format(NYcrime_count$ARREST_DATE, format="%Y")
colnames(NYcrime_count)[2] <- "ARREST_DATE"
head(NYcrime_count)
## # A tibble: 6 × 6
## X ARREST_DATE NUM_ARREST day month year
## <int> <date> <int> <int> <int> <int>
## 1 1 2006-01-01 551 1 1 2006
## 2 2 2007-01-01 589 1 1 2007
## 3 3 2008-01-01 769 1 1 2008
## 4 4 2009-01-01 764 1 1 2009
## 5 5 2010-01-01 958 1 1 2010
## 6 6 2011-01-01 900 1 1 2011
Now will do summary statistics and basic EDA on the Crime Count data
crime_plot <- plot(NYcrime_count$ARREST_DATE, NYcrime_count$NUM_ARREST)
crime_boxplot <- boxplot(NYcrime_count$NUM_ARREST)
Add the Crime data to the NY Weather Data, subsetting the weather data
to after 1/1/2022
crimeWeather <- subset(NYweath_prcpFact, year >= 2006 & year < 2022)
NYcrime_count <- NYcrime_count[order(NYcrime_count$ARREST_DATE),]
tail(crimeWeather)
## DATE day month year TMAX TMIN TAVG PRCP SNOW TOT_PRCP PRCP_factor
## 55819 2021-12-26 26 12 2021 51 39 NA 0.00 0 0.00 0
## 55820 2021-12-27 27 12 2021 39 34 NA 0.09 0 0.09 1
## 55821 2021-12-28 28 12 2021 47 36 NA 0.05 0 0.05 1
## 55822 2021-12-29 29 12 2021 44 41 NA 0.14 0 0.14 1
## 55823 2021-12-30 30 12 2021 49 43 NA 0.05 0 0.05 1
## 55824 2021-12-31 31 12 2021 55 48 NA 0.00 0 0.00 0
NYweath_crime <- cbind(crimeWeather, NYcrime_count$NUM_ARREST)
colnames(NYweath_crime)[12] <- c("NUM_ARREST")
#NYweath_crime_plot <- plot(sqrt(NYweath_crime$PRCP), NYweath_crime$NUM_ARREST)
#boxplot((NYweath_crime$TOT_PRCP))
NY_weathcrime_ggplot <- ggplot(NYweath_crime,
aes(x = TMAX, y =NUM_ARREST)) +
geom_point(aes(colour = PRCP_factor), alpha = 0.5) +
labs(x = "Temperature (ºF)", y = "Number of Daily Arrests",
title = "Weather Patterns for NYC Crime")
NY_weathcrime_ggplot
Initially made a boxplot of precipitation to observe the distribution..
It is extremely skewed. However, because I’m only interested in
determining if precipitation has an effect, will build a linear model
using PRCP as a Factor.
crimeWeath_lm <- lm(NUM_ARREST ~ TMAX + PRCP_factor + year,
data = NYweath_crime)
crimeWeathMonth_lm <- lm(NUM_ARREST ~ (TMAX + PRCP_factor + year + month),
data = NYweath_crime)
#crimeWeathTMIN_lm <- lm(NUM_ARREST ~ (TMIN + PRCP_factor),
# data = NYweath_crime)
summary(crimeWeathMonth_lm)
##
## Call:
## lm(formula = NUM_ARREST ~ (TMAX + PRCP_factor + year + month),
## data = NYweath_crime)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1029.2 -190.2 0.3 202.7 675.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 97442.248 1530.816 63.65 < 2e-16 ***
## TMAX 2.338 0.404 5.79 7.6e-09 ***
## PRCP_factor1 -46.130 7.368 -6.26 4.1e-10 ***
## year -47.963 0.760 -63.07 < 2e-16 ***
## month02 15.163 17.434 0.87 0.3845
## month03 3.697 17.531 0.21 0.8330
## month04 -52.883 19.351 -2.73 0.0063 **
## month05 -69.008 21.337 -3.23 0.0012 **
## month06 -125.430 23.518 -5.33 1.0e-07 ***
## month07 -160.821 25.009 -6.43 1.4e-10 ***
## month08 -131.473 24.384 -5.39 7.3e-08 ***
## month09 -144.147 22.672 -6.36 2.2e-10 ***
## month10 -82.766 19.830 -4.17 3.0e-05 ***
## month11 -135.518 18.046 -7.51 6.8e-14 ***
## month12 -205.302 17.138 -11.98 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 268 on 5829 degrees of freedom
## Multiple R-squared: 0.427, Adjusted R-squared: 0.426
## F-statistic: 310 on 14 and 5829 DF, p-value: <2e-16
The Linear model of Arrest Numbers as a result of temperature and precipitation. The Coefficients are significant but the R^2 is 0. This indicates there is a statistically significant relationship between Arrests and TMAX and Precipitation but these variables do not explain any of the variability in the data. Increasing TMAX correlated with an increase in Arrests. And PRCP present is associated with a decreased number of arrests.
Import the stock market data and convert the date column to a ‘Date’ data type. Also pulled out the ‘day’, ‘month’, and ‘year’ columns to help in analysis.
One last note, will need to fill in the missing date and populated the other columns with ‘NAs’. This will enable us to combine the stocks data with the weather data.
NYstock <- tibble(read.csv("./data/Dow Jones Industrial Average Historical Data.csv"))
tail(NYstock)
## # A tibble: 6 × 7
## Date Price Open High Low Vol. Change..
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 11/11/22 33,749.18 33,797.75 33,815.82 33,394.49 418.43M 0.11%
## 2 11/14/22 33,537.16 33,645.97 33,963.62 33,534.27 339.05M -0.63%
## 3 11/15/22 33,594.17 33,804.97 33,984.90 33,321.26 380.64M 0.17%
## 4 11/16/22 33,556.60 33,554.93 33,682.39 33,521.23 288.59M -0.11%
## 5 11/17/22 33,547.37 33,277.00 33,615.82 33,245.78 301.53M -0.03%
## 6 11/18/22 33,747.14 33,606.59 33,827.83 33,541.07 296.37M 0.60%
NYstock$Date <- as.Date(NYstock$Date, format = "%m/%d/%y")
NYstock2 <- NYstock
NYstock2 <- NYstock2 %>%
complete(Date = seq.Date(min(Date), max(Date), by="day"))
options(scientific=T, digits = 10)
# This is all just test code for figuring out how to clean the data.
# Not part of final script.
#NYstocktest <- NYstock2
#NYstocktest$Vol. = substr(NYstocktest$Vol.,1,nchar(NYstocktest$Vol.)-1)
#tail(NYstocktest)
#NYstocktest$Price <- gsub(",", "", NYstocktest$Price)
#NYstocktest[3:5] <- lapply(NYstocktest[3:5], gsub, pattern = ",", replacement = "")
#NYstocktest$Change.. <- gsub("%", "", NYstocktest$Change..)
#NYstocktest[2:7] <- sapply(NYstocktest[2:7], as.numeric)
###
NYstock2$Vol. = substr(NYstock2$Vol., 1, nchar(NYstock2$Vol.) - 1)
NYstock2[2:5] <- lapply(NYstock2[2:5], gsub, pattern = ",", replacement = "")
NYstock2$Change.. <- gsub("%", "", NYstock2$Change..)
NYstock2[2:7] <- sapply(NYstock2[2:7], as.numeric)
NYstock2$day <- format(NYstock2$Date, format="%d")
NYstock2$month <- format(NYstock2$Date, format="%m")
NYstock2$year <- format(NYstock2$Date, format="%Y")
head(NYstock2)
## # A tibble: 6 × 10
## Date Price Open High Low Vol. Change.. day month year
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 1979-12-25 839. 839. 842. 833. NA 0 25 12 1979
## 2 1979-12-26 838. 839. 843. 834. NA -0.12 26 12 1979
## 3 1979-12-27 840. 838. 843. 834. NA 0.23 27 12 1979
## 4 1979-12-28 839. 840. 843. 835. NA -0.14 28 12 1979
## 5 1979-12-29 NA NA NA NA NA NA 29 12 1979
## 6 1979-12-30 NA NA NA NA NA NA 30 12 1979
summary(NYstock2)
## Date Price Open
## Min. :1979-12-25 Min. : 759.13 Min. : 759.980
## 1st Qu.:1990-09-15 1st Qu.: 2733.83 1st Qu.: 2733.387
## Median :2001-06-06 Median : 9447.96 Median : 9438.680
## Mean :2001-06-06 Mean :10214.49 Mean :10212.138
## 3rd Qu.:2012-02-26 3rd Qu.:13171.78 3rd Qu.:13170.677
## Max. :2022-11-18 Max. :36799.65 Max. :36722.600
## NA's :4848 NA's :4848
## High Low Vol.
## Min. : 767.410 Min. : 729.950 Min. : 1.5900
## 1st Qu.: 2747.680 1st Qu.: 2715.142 1st Qu.: 40.3575
## Median : 9521.945 Median : 9348.660 Median :152.8850
## Mean :10274.204 Mean :10147.937 Mean :168.8159
## 3rd Qu.:13242.507 3rd Qu.:13098.073 3rd Qu.:257.3475
## Max. :36952.650 Max. :36636.000 Max. :922.6800
## NA's :4848 NA's :4848 NA's :6878
## Change.. day month year
## Min. :-22.610000 Length:15670 Length:15670 Length:15670
## 1st Qu.: -0.460000 Class :character Class :character Class :character
## Median : 0.050000 Mode :character Mode :character Mode :character
## Mean : 0.040395
## 3rd Qu.: 0.570000
## Max. : 11.370000
## NA's :4848
options(scientific=T, digits = 3)
Really only care about the volume of data. will remove all other columns and only work with Date + Vol. Will combine that witht he weather data for further analysis.
NYstock_final <- NYstock2[,c("Date", "Vol.")]
NYstock_final <- subset(NYstock_final, Date <= "2022-09-26")
weather_stockDates <- subset(NYweath_prcpFact, DATE >= "1979-12-25")
stockWeather <- cbind(weather_stockDates, NYstock_final)
colnames(stockWeather)[13] <- c("Volume")
Now will do EDA on the the volume data to build a linear regression model. First will look at normality and look for any correlations.
stockWeather_rmNA <- subset(stockWeather, !is.na(Volume))
stock_hist <- hist(stockWeather_rmNA$Volume)
Histogram shows the data is right skewed. Will use sqrt of the volume to normalize.
stockWeather_rmNA$Volume_norm <- sqrt(stockWeather_rmNA$Volume)
stockWeather_rmNA <- subset(stockWeather_rmNA, select = -c(Date))
stockWeather_90s <- subset(stockWeather_rmNA, year >= 1988 & year <= 1999)
hist(stockWeather_rmNA$Volume_norm)
boxplot(stockWeather_rmNA$Volume_norm)
The distribution of sqrt Volume is considerably more normal. Will now look at correlations with Weather data. The boxplot shows there are no outliers after normalizing the data.
pairs.panels(stockWeather_rmNA[c("TMAX", "TOT_PRCP","PRCP_factor",
"Volume","Volume_norm")],
method = "pearson", # correlation method
hist.col = "#00AFBB",
density = FALSE, # show density plots
ellipses = FALSE # show correlation ellipses
)
There are no strong correlations present in the data. Will next look at a scatter plot of the Stock Volume vs TMAX, categorized by days with PRCP.
NY_weathstock_scatter <- ggplot(stockWeather_rmNA, aes(x = TMAX, y =Volume_norm)) +
geom_point(aes(colour = PRCP_factor)) +
labs(x = "Temperature", y = "Total Daily DOW Trade Volume",
title = "Weather Patterns for DOW Trade Volume")
NY_weathstock_scatter
## Trying again with the 90's stock data.
NY_90s_weathstock_scatter <- ggplot(stockWeather_90s, aes(x = TMAX, y =Volume_norm)) +
geom_point(aes(colour = PRCP_factor)) +
labs(x = "Temperature (ºF)", y = "Normalized Daily DOW Trade Volume (M)",
title = "Weather Patterns for DOW Trade Volume in the 1990s")
NY_90s_weathstock_scatter
stock_LM <- lm(Volume_norm ~ TMAX + PRCP_factor + year + month,
stockWeather_rmNA)
summary(stock_LM)
##
## Call:
## lm(formula = Volume_norm ~ TMAX + PRCP_factor + year + month,
## data = stockWeather_rmNA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.414 -2.203 -0.663 2.724 14.811
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.92e+02 7.64e+00 -103.76 < 2e-16 ***
## TMAX -8.70e-03 4.31e-03 -2.02 0.04389 *
## PRCP_factor1 -2.84e-02 8.04e-02 -0.35 0.72375
## year 4.02e-01 3.81e-03 105.40 < 2e-16 ***
## month02 -2.86e-01 1.92e-01 -1.49 0.13696
## month03 1.53e-01 1.91e-01 0.80 0.42478
## month04 -2.01e-01 2.12e-01 -0.95 0.34321
## month05 -3.62e-01 2.31e-01 -1.57 0.11699
## month06 -2.62e-01 2.54e-01 -1.03 0.30286
## month07 -5.68e-01 2.71e-01 -2.10 0.03600 *
## month08 -9.47e-01 2.65e-01 -3.58 0.00035 ***
## month09 -2.91e-01 2.46e-01 -1.18 0.23692
## month10 -1.11e-01 2.16e-01 -0.52 0.60609
## month11 -4.67e-01 2.01e-01 -2.33 0.02003 *
## month12 -6.34e-01 1.90e-01 -3.34 0.00084 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.57 on 8738 degrees of freedom
## Multiple R-squared: 0.562, Adjusted R-squared: 0.561
## F-statistic: 801 on 14 and 8738 DF, p-value: <2e-16
stock_LM_90s <- lm(Volume_norm ~ TMAX + PRCP_factor + year + month,
stockWeather_90s)
summary(stock_LM_90s)
##
## Call:
## lm(formula = Volume_norm ~ TMAX + PRCP_factor + year + month,
## data = stockWeather_90s)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.575 -0.746 -0.142 0.596 8.329
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.62e+02 1.19e+01 -72.61 < 2e-16 ***
## TMAX 5.16e-03 2.31e-03 2.24 0.02541 *
## PRCP_factor1 -3.52e-02 4.37e-02 -0.81 0.42087
## year 4.35e-01 5.95e-03 73.10 < 2e-16 ***
## month02 -1.51e-01 1.03e-01 -1.46 0.14481
## month03 -2.52e-01 1.02e-01 -2.48 0.01322 *
## month04 -1.40e-01 1.12e-01 -1.24 0.21419
## month05 -4.00e-01 1.22e-01 -3.29 0.00102 **
## month06 -4.46e-01 1.35e-01 -3.31 0.00096 ***
## month07 -4.75e-01 1.44e-01 -3.31 0.00095 ***
## month08 -4.18e-01 1.39e-01 -3.00 0.00270 **
## month09 -2.52e-01 1.29e-01 -1.96 0.05027 .
## month10 8.14e-02 1.13e-01 0.72 0.47338
## month11 4.88e-02 1.06e-01 0.46 0.64464
## month12 -2.97e-02 1.01e-01 -0.30 0.76783
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.13 on 3018 degrees of freedom
## Multiple R-squared: 0.641, Adjusted R-squared: 0.64
## F-statistic: 385 on 14 and 3018 DF, p-value: <2e-16
The Linear Model that incorporates all the Stock data from 1988-present had a statistically significant TMAX, year, and some of the months.
The stock data subset to the 90s generated a similar model.
Find the confidence intervals of the coefficients.
Looking at the effect of precipitation and temperature on the number of positive COVID cases. Using the “CASE_COUNT” parameter for the NYC Covid dataset. CASE_COUNT represents the count of patients tested who were confirmed to be COVID-19 cases on date_of_interest
options(scientific=T, digits = 3)
NYcovid <- tibble(read.csv("./data/COVID-19_Daily_Counts_of_Cases__Hospitalizations__and_Deaths.csv"))
NYcovid <- select(NYcovid, 1:3)
head(NYcovid)
## # A tibble: 6 × 3
## date_of_interest CASE_COUNT PROBABLE_CASE_COUNT
## <chr> <int> <int>
## 1 02/29/2020 1 0
## 2 03/01/2020 0 0
## 3 03/02/2020 0 0
## 4 03/03/2020 1 0
## 5 03/04/2020 5 0
## 6 03/05/2020 3 0
colnames(NYcovid)[1] <- "DATE"
NYcovid$DATE <- as.Date(NYcovid$DATE, format = "%m/%d/%Y")
NYcovid$day <- format(NYcovid$DATE, format="%d")
NYcovid$month <- format(NYcovid$DATE, format="%m")
NYcovid$year <- format(NYcovid$DATE, format="%Y")
head(NYcovid)
## # A tibble: 6 × 6
## DATE CASE_COUNT PROBABLE_CASE_COUNT day month year
## <date> <int> <int> <chr> <chr> <chr>
## 1 2020-02-29 1 0 29 02 2020
## 2 2020-03-01 0 0 01 03 2020
## 3 2020-03-02 0 0 02 03 2020
## 4 2020-03-03 1 0 03 03 2020
## 5 2020-03-04 5 0 04 03 2020
## 6 2020-03-05 3 0 05 03 2020
summary(NYcovid)
## DATE CASE_COUNT PROBABLE_CASE_COUNT day
## Min. :2020-02-29 Min. : 0 Min. : 0 Length:991
## 1st Qu.:2020-11-02 1st Qu.: 574 1st Qu.: 76 Class :character
## Median :2021-07-08 Median : 1437 Median : 343 Mode :character
## Mean :2021-07-08 Mean : 2534 Mean : 478
## 3rd Qu.:2022-03-12 3rd Qu.: 2796 3rd Qu.: 644
## Max. :2022-11-15 Max. :54947 Max. :5882
## month year
## Length:991 Length:991
## Class :character Class :character
## Mode :character Mode :character
##
##
##
Next, Looked at normality of the COVID count data. The counts were extremely skewed to the right. First removed multiple rounds of outliers using the outlierKD2 funciton. After removing all outliers, the data was still skewed right but less extreme. Used a square-root transform to normalize the data.
covid_plot <- plot(NYcovid$DATE, NYcovid$CASE_COUNT)
covid_boxplot <- boxplot(NYcovid$CASE_COUNT)
NYcovid_rmOuts <- outlierKD2(NYcovid, CASE_COUNT, rm=TRUE, boxplt=TRUE, histogram=TRUE, qqplt=TRUE)
## Outliers identified: 42
## Proportion (%) of outliers: 4.4
## Mean of the outliers: 21841
## Mean without removing outliers: 2534
## Mean if we remove outliers: 1680
## Outliers successfully removed
NYcovid_rmOuts2 <- outlierKD2(NYcovid_rmOuts, CASE_COUNT, rm=TRUE, boxplt=TRUE, histogram=TRUE, qqplt=TRUE)
## Outliers identified: 15
## Proportion (%) of outliers: 1.6
## Mean of the outliers: 5696
## Mean without removing outliers: 1680
## Mean if we remove outliers: 1615
## Outliers successfully removed
NYcovid_rmOuts3 <- outlierKD2(NYcovid_rmOuts2, CASE_COUNT, rm=TRUE, boxplt=TRUE, histogram=TRUE, qqplt=TRUE)
## Outliers identified: 7
## Proportion (%) of outliers: 0.8
## Mean of the outliers: 5200
## Mean without removing outliers: 1615
## Mean if we remove outliers: 1588
## Outliers successfully removed
NYcovid_rmOuts4 <- outlierKD2(NYcovid_rmOuts3, CASE_COUNT, rm=TRUE, boxplt=TRUE, histogram=TRUE, qqplt=TRUE)
## Outliers identified: 2
## Proportion (%) of outliers: 0.2
## Mean of the outliers: 5094
## Mean without removing outliers: 1588
## Mean if we remove outliers: 1581
## Outliers successfully removed
covid_plot <- plot(NYcovid_rmOuts4$DATE, NYcovid_rmOuts4$CASE_COUNT)
tail(NYcovid_rmOuts3)
## # A tibble: 6 × 6
## DATE CASE_COUNT PROBABLE_CASE_COUNT day month year
## <date> <int> <int> <chr> <chr> <chr>
## 1 2022-11-10 1957 621 10 11 2022
## 2 2022-11-11 1580 395 11 11 2022
## 3 2022-11-12 1477 539 12 11 2022
## 4 2022-11-13 1332 514 13 11 2022
## 5 2022-11-14 2861 800 14 11 2022
## 6 2022-11-15 2120 652 15 11 2022
sqrt_count <- sqrt(NYcovid_rmOuts3$CASE_COUNT)
#hist(sqrt_count)
NYcovid_final <- cbind(NYcovid_rmOuts4, sqrt_count)
head(NYcovid_final)
## DATE CASE_COUNT PROBABLE_CASE_COUNT day month year sqrt_count
## 1 2020-02-29 1 0 29 02 2020 1.00
## 2 2020-03-01 0 0 01 03 2020 0.00
## 3 2020-03-02 0 0 02 03 2020 0.00
## 4 2020-03-03 1 0 03 03 2020 1.00
## 5 2020-03-04 5 0 04 03 2020 2.24
## 6 2020-03-05 3 0 05 03 2020 1.73
Add the Covid data to the NY Weather Data, subsetting the weather data
to after 2/29/2022
covWeather <- subset(NYweath_prcpFact, DATE >= ("2020-02-29"))
NYcovid_finaldates <- subset(NYcovid_final, DATE <= "2022-09-26")
tail(covWeather)
## DATE day month year TMAX TMIN TAVG PRCP SNOW TOT_PRCP PRCP_factor
## 56088 2022-09-21 21 09 2022 80 62 NA 0.00 0 0.00 0
## 56089 2022-09-22 22 09 2022 75 57 NA 0.47 0 0.47 1
## 56090 2022-09-23 23 09 2022 63 51 NA 0.00 0 0.00 0
## 56091 2022-09-24 24 09 2022 69 49 NA 0.00 0 0.00 0
## 56092 2022-09-25 25 09 2022 72 59 NA 1.11 0 1.11 1
## 56093 2022-09-26 26 09 2022 74 58 NA 0.00 0 0.00 0
NYweath_prcpCov <- cbind(covWeather, NYcovid_finaldates$CASE_COUNT,
NYcovid_finaldates$sqrt_count)
colnames(NYweath_prcpCov)[12:13] <- c("CASE_COUNT", "sqrt_count")
covCount_prcp_plot <- plot(NYweath_prcpCov$sqrt_count, sqrt(NYweath_prcpCov$PRCP))
NYweath_cov_final <- NYweath_prcpCov[,c(1:5, 8, 10:13)]
Plot of COV case count vs precipitation. no apparent relationship, however, more interested in effect of precip not so much about the correlation in prcp
T-test comparing Covid positive counts on days with precipitation vs days without prcp.
cov_prcp1 <- subset(NYweath_cov_final, PRCP_factor == 1)
cov_prcp0 <- subset(NYweath_cov_final, PRCP_factor == 0)
cov_count_ttest <- t.test(cov_prcp0$sqrt_count, cov_prcp1$sqrt_count)
cov_count_ttest
##
## Welch Two Sample t-test
##
## data: cov_prcp0$sqrt_count and cov_prcp1$sqrt_count
## t = 0.2, df = 650, p-value = 0.8
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.02 2.49
## sample estimates:
## mean of x mean of y
## 36.3 36.1
cov_count_bplot <- ggplot()+
geom_boxplot(data = NYweath_cov_final,
aes(y = sqrt_count, x = PRCP_factor)) +
labs(title = "COVID Positive Counts")
cov_count_bplot
## Repeating this EDA looking only at Covid cases from 2021+.
cov_2021 <- subset(NYweath_cov_final, year >= 2021)
cov_2021count_bplot <- ggplot()+
geom_boxplot(data = cov_2021, aes(y = sqrt_count, x = PRCP_factor)) +
labs(title = "2021 COVID Positive Counts")
cov_2021count_bplot
cov_2021prcp1 <- subset(cov_2021, PRCP_factor == 1)
cov_2021prcp0 <- subset(cov_2021, PRCP_factor == 0)
cov_2021count_ttest <- t.test(cov_2021prcp0$sqrt_count, cov_2021prcp1$sqrt_count)
cov_2021count_ttest
##
## Welch Two Sample t-test
##
## data: cov_2021prcp0$sqrt_count and cov_2021prcp1$sqrt_count
## t = 1, df = 414, p-value = 0.2
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.06 4.10
## sample estimates:
## mean of x mean of y
## 40.2 38.7
No significant difference in the mean Covid case counts on days with precipitation or without. However, there was a greater difference in the means when only incorporating Covid from 2021+.
covWeath_final_scatter <- ggplot(NYweath_cov_final,
aes(x = TMAX,
y =sqrt_count,
)) +
geom_point(aes(colour = month, shape = PRCP_factor)) +
labs(x = "Temperature",
y = "Square Root of Total Daily DOW Trade Volume",
title = "Weather Patterns for Covid Case Counts")
covWeath_final_scatter
Will now build a linear model that incorporates temperature, precipitation, and Month to predict Covid counts.
library(psych)
pairs.panels(NYweath_cov_final[4:10],
method = "pearson", # correlation method
hist.col = "#00AFBB",
density = FALSE, # show density plots
ellipses = FALSE # show correlation ellipses
)
cov_weathLM <- lm(sqrt_count ~ TMAX + PRCP_factor + year,
data = NYweath_cov_final)
summary(cov_weathLM)
##
## Call:
## lm(formula = sqrt_count ~ TMAX + PRCP_factor + year, data = NYweath_cov_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.45 -9.44 -1.96 10.18 42.67
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.51e+04 1.25e+03 -12.09 <2e-16 ***
## TMAX -3.11e-01 2.87e-02 -10.82 <2e-16 ***
## PRCP_factor1 -4.84e-01 1.01e+00 -0.48 0.63
## year 7.49e+00 6.17e-01 12.13 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.3 on 873 degrees of freedom
## (64 observations deleted due to missingness)
## Multiple R-squared: 0.23, Adjusted R-squared: 0.227
## F-statistic: 86.8 on 3 and 873 DF, p-value: <2e-16
cov_weathLM_month <- lm(sqrt_count ~ TMAX + PRCP_factor + year + month,
data = NYweath_cov_final)
summary(cov_weathLM_month)
##
## Call:
## lm(formula = sqrt_count ~ TMAX + PRCP_factor + year + month,
## data = NYweath_cov_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -41.67 -8.91 -0.68 8.14 41.26
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.56e+04 1.22e+03 -12.74 < 2e-16 ***
## TMAX -5.53e-02 5.57e-02 -0.99 0.3210
## PRCP_factor1 -6.30e-01 9.33e-01 -0.68 0.4994
## year 7.74e+00 6.05e-01 12.79 < 2e-16 ***
## month02 -1.87e+01 3.03e+00 -6.16 1.1e-09 ***
## month03 -1.70e+01 2.99e+00 -5.67 1.9e-08 ***
## month04 -9.47e+00 3.15e+00 -3.01 0.0027 **
## month05 -1.91e+01 3.41e+00 -5.62 2.6e-08 ***
## month06 -2.63e+01 3.76e+00 -7.00 5.0e-12 ***
## month07 -2.15e+01 3.94e+00 -5.46 6.2e-08 ***
## month08 -2.07e+01 3.89e+00 -5.32 1.3e-07 ***
## month09 -2.32e+01 3.62e+00 -6.39 2.6e-10 ***
## month10 -2.54e+01 3.43e+00 -7.40 3.3e-13 ***
## month11 -1.66e+01 3.22e+00 -5.15 3.2e-07 ***
## month12 2.09e+00 3.31e+00 0.63 0.5289
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.2 on 862 degrees of freedom
## (64 observations deleted due to missingness)
## Multiple R-squared: 0.358, Adjusted R-squared: 0.347
## F-statistic: 34.3 on 14 and 862 DF, p-value: <2e-16
cov2021_weathLM <- lm(CASE_COUNT ~ TMAX + PRCP_factor + year + month,
data = cov_2021)
summary(cov2021_weathLM)
##
## Call:
## lm(formula = CASE_COUNT ~ TMAX + PRCP_factor + year + month,
## data = cov_2021)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3423 -829 -67 548 3129
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.14e+06 1.92e+05 -5.94 5.0e-09 ***
## TMAX 2.11e-01 5.45e+00 0.04 0.9692
## PRCP_factor1 -7.93e+01 9.24e+01 -0.86 0.3908
## year 5.66e+02 9.50e+01 5.96 4.5e-09 ***
## month02 -1.73e+03 2.47e+02 -7.01 7.0e-12 ***
## month03 -1.81e+03 2.57e+02 -7.03 5.9e-12 ***
## month04 -1.82e+03 2.78e+02 -6.53 1.5e-10 ***
## month05 -1.84e+03 3.07e+02 -6.00 3.6e-09 ***
## month06 -2.28e+03 3.40e+02 -6.70 5.1e-11 ***
## month07 -1.81e+03 3.57e+02 -5.05 5.9e-07 ***
## month08 -1.89e+03 3.58e+02 -5.29 1.8e-07 ***
## month09 -2.28e+03 3.30e+02 -6.92 1.2e-11 ***
## month10 -2.61e+03 3.25e+02 -8.03 5.7e-15 ***
## month11 -2.36e+03 2.92e+02 -8.07 4.2e-15 ***
## month12 -1.02e+03 3.74e+02 -2.73 0.0064 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1060 on 565 degrees of freedom
## (54 observations deleted due to missingness)
## Multiple R-squared: 0.244, Adjusted R-squared: 0.225
## F-statistic: 13 on 14 and 565 DF, p-value: <2e-16
The linear model that only incorporates TMAX, PRCP_factor, and year has statistically significant coefficients for TMAX and year. This indicates the model predicts that the sqrt of covid counts decreases by 0.3 for degree F increase in TMAX.
However, when we account for month, we lose the significance in the TMAX variable. This indicates that the covid cases are more effected by the seasonal changes rather than Temperature.
Let’s try to graph this! That did not work!
#covLM_plot <- covWeath_final_scatter +
# geom_smooth(method = lm, se = FALSE, fullrange = TRUE,
#aes(colour = PRCP_factor))
#covLM_plot
#ggplt
# Plotting multiple Regression Lines
#ggplt+geom_smooth(method=lm,se=FALSE,fullrange=TRUE,
# aes(color=Tree))
#covLM_plot
Let’s start by ingesting the Air Quality to add into this prediction in place of COVID.
#DailyAQ_00_22 <- data.frame(read.csv("data/daily-AQ-NY-00-20.csv"))
#DailyAQ_00_22 <- DailyAQ_00_22[c('Date', 'Daily.Mean.PM2.5.Concentration', #'DAILY_AQI_VALUE')]
#colnames(DailyAQ_00_22) <- c('DATE', 'PM2.5', 'AQI')
#str(DailyAQ_00_22)
#xkablesummary(DailyAQ_00_22)
#xkabledplyhead(DailyAQ_00_22)
Now let’s build a master dataframe that incorporates Date, Year, Month, TMAX, PRCP, PRCP_Factor, Crime Count, DOW Volume, PM2.5, and AQI.
# FORMAT AQ Data and subset dates
#AQ_forLogit <- DailyAQ_00_22
#AQ_forLogit$DATE <- as.Date(AQ_forLogit$DATE, format = "%m/%d/%y")
#AQ_forLogit$day <- format(AQ_forLogit$DATE, format="%d")
#AQ_forLogit$month <- format(AQ_forLogit$DATE, format="%m")
#AQ_forLogit$year <- format(AQ_forLogit$DATE, format="%Y")
#AQ_forLogit$year <- as.numeric(AQ_forLogit$year)
#AQ_forLogit2 <- AQ_forLogit %>%
# complete(DATE = seq.Date(min(DATE), max(DATE), by="day"))
#AQ_masterDates <- subset(AQ_forLogit2, year >= 2006 & year < 2022)
stock_masterDates <- subset(NYstock_final,Date >= "2006-01-01" &
Date <= "2021-12-31")
crime_masterDates <- NYcrime_count
weath_masterDates <- subset(NYweath_prcpFact, year >= 2006 & year < 2022)
master_log <- cbind(weath_masterDates,
crime_masterDates$NUM_ARREST,
stock_masterDates$Vol.)
colnames(master_log)[12:13] <- c('NUM_ARREST', 'Volume')
head(master_log)
## DATE day month year TMAX TMIN TAVG PRCP SNOW TOT_PRCP PRCP_factor
## 49981 2006-01-01 01 01 2006 42 32 NA 0.00 0 0.00 0
## 49982 2006-01-02 02 01 2006 48 39 NA 0.63 0 0.63 1
## 49983 2006-01-03 03 01 2006 40 33 NA 1.13 0 1.13 1
## 49984 2006-01-04 04 01 2006 38 29 NA 0.00 0 0.00 0
## 49985 2006-01-05 05 01 2006 50 37 NA 0.05 0 0.05 1
## 49986 2006-01-06 06 01 2006 43 30 NA 0.00 0 0.00 0
## NUM_ARREST Volume
## 49981 551 NA
## 49982 618 NA
## 49983 899 303
## 49984 1229 271
## 49985 1383 251
## 49986 1248 292
master_logFinal <- subset(master_log, !is.na(Volume))
master_logFinal$Volume_norm <- sqrt(master_logFinal$Volume)
Now let’s build the LR:
prcp_logit <- glm(PRCP_factor ~ TMAX + NUM_ARREST +
Volume_norm + year + month,
data = master_logFinal,
family = binomial(link = "logit"))
summary(prcp_logit)
##
## Call:
## glm(formula = PRCP_factor ~ TMAX + NUM_ARREST + Volume_norm +
## year + month, family = binomial(link = "logit"), data = master_logFinal)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.295 -0.957 -0.844 1.350 1.707
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 72.931220 20.475973 3.56 0.00037 ***
## TMAX -0.005733 0.003813 -1.50 0.13272
## NUM_ARREST -0.000950 0.000142 -6.67 2.6e-11 ***
## Volume_norm -0.010261 0.008914 -1.15 0.24972
## year -0.035885 0.010121 -3.55 0.00039 ***
## month02 0.242612 0.168088 1.44 0.14892
## month03 0.149488 0.169288 0.88 0.37722
## month04 0.400139 0.185421 2.16 0.03093 *
## month05 0.326598 0.205157 1.59 0.11140
## month06 0.503649 0.222956 2.26 0.02389 *
## month07 0.344674 0.241819 1.43 0.15406
## month08 0.201185 0.236342 0.85 0.39463
## month09 -0.049412 0.222747 -0.22 0.82445
## month10 0.074118 0.193259 0.38 0.70134
## month11 -0.012329 0.178675 -0.07 0.94499
## month12 0.046996 0.169456 0.28 0.78152
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5256.9 on 4027 degrees of freedom
## Residual deviance: 5185.4 on 4012 degrees of freedom
## AIC: 5217
##
## Number of Fisher Scoring iterations: 4
Let’s assess the LR!
library(ModelMetrics)
prcpLR_cm <- confusionMatrix(actual = prcp_logit$y,
predicted = prcp_logit$fitted.values)
prcpLR_cm
## [,1] [,2]
## [1,] 2541 1399
## [2,] 43 45
prcpLR_acc <- (prcpLR_cm[2,2] + prcpLR_cm[1,1])/(sum(prcpLR_cm))
prcpLR_prec <- (prcpLR_cm[2,2])/(prcpLR_cm[2,2]+prcpLR_cm[1,2])
prcpLR_rec <- (prcpLR_cm[2,2])/(prcpLR_cm[2,2]+prcpLR_cm[2,1])
prcpLR_spec <- (prcpLR_cm[1,1])/(prcpLR_cm[1,1]+prcpLR_cm[1,2])
library(pROC)
master_logFinal$prob=predict(prcp_logit, type = c("response"))
prcp_roc <- roc(PRCP_factor ~ prob, data = master_logFinal)
prcp_auc <- auc(prcp_roc)
prcp_auc
## Area under the curve: 0.575
plot(prcp_roc)
library(pscl)
prcp_pr2 <- pR2(prcp_logit)
## fitting null model for pseudo-r2
prcp_pr2
## llh llhNull G2 McFadden r2ML r2CU
## -2.59e+03 -2.63e+03 7.15e+01 1.36e-02 1.76e-02 2.41e-02